...
On Crowdflower, each revision is rated 10 times. The raters are given three questions:
In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from crowdflower_analysis import *
from krippendorf_alpha import *
from krippendorf_alpha_grrrr import *
In [2]:
# Raise pandas' per-cell display width so long text values are rendered in
# full instead of being truncated with an ellipsis.
pd.set_option('display.max_colwidth', 1000)
In [3]:
# Load the raw annotation export. The path is relative to the notebook's
# working directory.
annotations_csv = '../../../../data/annotations/nda/nda onion layer 5 raters 10.csv'
dat = pd.read_csv(annotations_csv)
In [4]:
# Keep only the rows whose `_golden` flag is False (presumably CrowdFlower's
# test-question marker -- TODO confirm against the export format).
dat = dat[dat['_golden'] == False]

# Replace missing data with False. NaN is matched by value rather than as a
# pattern, so no regex flag is passed.
dat = dat.replace(np.nan, False)

# For every answer category, derive a column from `is_harassment_or_attack`
# via `create_column_of_counts` (helper pulled in by the star imports; its
# exact semantics are not visible in this notebook).
attack_columns = ['not_attack', 'other', 'quoting', 'recipient', 'third_party']
for col in attack_columns:
    dat[col] = create_column_of_counts(dat['is_harassment_or_attack'], col)
In [5]:
# Take the first 1000 distinct revision ids (in order of first appearance)
# and keep them as a set for fast membership checks.
distinct_rev_ids = dat['rev_id'].unique()
chosen_ids = set(distinct_rev_ids[:1000])
In [6]:
# Restrict the annotations to the sampled revisions. `isin` performs the
# membership test in one vectorised pass instead of calling a Python lambda
# once per row.
sub_dat = dat[dat['rev_id'].isin(chosen_ids)]
In [7]:
# Group the sampled annotations by worker; iterating `groups` yields
# (worker id, that worker's rows) pairs.
groups = sub_dat.groupby('_worker_id')
In [8]:
# Build one {rev_id: recipient} mapping per worker, in groupby order.
# The two columns are zipped directly rather than walked with iterrows():
# iterrows() materialises a Series for every row, which is slow and upcasts
# mixed dtypes (integer rev_ids can come back as floats). If a worker has
# several rows for the same rev_id the last one wins, as before.
data = []
for worker_id, worker_rows in groups:
    data.append(dict(zip(worker_rows['rev_id'], worker_rows['recipient'])))
In [9]:
# Krippendorff's alpha over the per-worker {rev_id: recipient} dicts in
# `data`, with `nominal_metric` passed as the distance function.
# NOTE(review): both names come from the star imports at the top of the
# notebook -- confirm which module provides them.
krippendorff_alpha(data, metric = nominal_metric)
Out[9]:
In [10]:
# Run the sampled annotations through `clean_df` (a helper from the star
# imports; what it reshapes or adds is not visible here -- TODO confirm).
cleaned_df = clean_df(sub_dat)
In [11]:
# Agreement on the attack question, computed with `Krippendorf_alpha`
# (capital K) -- a different name from the lowercase `krippendorff_alpha`
# called earlier in this notebook.
# Presumably `not_attack_0` / `not_attack_1` are columns produced by
# `clean_df` -- TODO confirm.
Krippendorf_alpha(cleaned_df, ['not_attack_0', 'not_attack_1'])
Out[11]:
In [12]:
# Disabled code kept as a bare string literal: nothing inside it executes,
# and the cell simply echoes the string as its output.
# NOTE(review): the text refers to `grouped_dat` and `aggressive_columns`,
# neither of which is assigned anywhere in the visible notebook cells, and it
# uses Python 2 print statements.
'''
for key in grouped_dat.keys():
print "Krippendorf's Alpha (aggressiveness) for layer %s: " % key
print Krippendorf_alpha(grouped_dat[key], aggressive_columns, distance = interval_distance)
print "Krippendorf's Alpha (attack) for layer %s: " % key
print Krippendorf_alpha(grouped_dat[key], ['not_attack_0', 'not_attack_1'])
'''
Out[12]: